## Warning: package 'rcompanion' was built under R version 4.0.3
## Warning: package 'clustMixType' was built under R version 4.0.3
library(arules)
library(arulesViz)
library(plotly)
library(abind)
library(timeDate)
library(rvest)
# Plot Theme
theme_set <- theme(legend.key = element_rect(fill="black"),
legend.background = element_rect(color="white", fill="#263238"),
plot.subtitle = element_text(size=6, color="white"),
panel.background = element_rect(fill="#dddddd"),
panel.border = element_rect(fill=NA),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(color="darkgrey", linetype=2),
panel.grid.minor.y = element_blank(),
plot.background = element_rect(fill="#263238"),
text = element_text(color="white"),
axis.text = element_text(color="white"))Source https://www.kaggle.com/xiaowenlimarketing/international-student-time-management
## [1] "Number" "Age" "Gender" "Nationality" "Program"
## [6] "Course" "English" "Academic" "Attendance" "X6"
## [11] "X7" "X8" "X9" "X10" "X11"
## [16] "X12" "X13" "X14" "X15" "X16"
## [21] "X17"
Where X6-X17 are You often feel that your life is aimless, with no definite purpose You never have trouble organizing the things you have to do? Once you’ve started an activity, you persist at it until you’ve completed it Sometimes you feel that the things you have to do during the day just don’t seem to matter You will plan your activities from day to day. You tend to leave things to the last minute? You tend to change rather aimlessly from one activity to another during the day. You give up the things that you planning to do just because your friend says no. You think you do enough with your time. You are easy to get bored with your day-today activities. The important interests/activities in your life tend to change frequently. You know how much time you spend on each of the homework I do.
# Subset the relevant data for the cleaning and transformation
data <- original_data %>%
select(-c(Number,Nationality,Program,Course,
English,Academic,Attendance))# Construct the 'clean' function
clean <- function(x){
# Reduce factor levels
x$Age <- ifelse(x$Age == "26-30" | x$Age == ">36" | x$Age == "31-35", ">25",x$Age)
x$Age <- ifelse(x$Age == "<18", "18-20",x$Age)
x$Age <- ordered(x$Age, levels=c("18-20","21-25",">25"))
# Change " " into Neither
x$X6 <- ifelse(x$X6=="","Neither",x$X6)
x$X8 <- ifelse(x$X8=="","Neither",x$X8)
x$X9 <- ifelse(x$X9=="","Neither",x$X9)
x$X10 <- ifelse(x$X10=="","Neither",x$X10)
x$X11 <- ifelse(x$X11=="","Neither",x$X11)
x$X12 <- ifelse(x$X12=="","Neither",x$X12)
x$X14 <- ifelse(x$X14=="","Neither",x$X14)
x$X16 <- ifelse(x$X16=="","Neither",x$X16)
x$X17 <- ifelse(x$X17=="","Neither",x$X17)
# Transform the character into Likert Scale
x$X6_num <- ifelse(x$X6=="Strong Agree",1,
ifelse(x$X6=="Agree",2,
ifelse(x$X6=="Neither",3,
ifelse(x$X6=="Disagree",4,5))))
x$X7_num <- ifelse(x$X7=="Strong Agree",5,
ifelse(x$X7=="Agree",4,
ifelse(x$X7=="Neither",3,
ifelse(x$X7=="Disagree",2,1))))
x$X8_num <- ifelse(x$X8=="Strong Agree",5,
ifelse(x$X8=="Agree",4,
ifelse(x$X8=="Neither",3,
ifelse(x$X8=="Disagree",2,1))))
x$X9_num <- ifelse(x$X9=="Strong Agree",1,
ifelse(x$X9=="Agree",2,
ifelse(x$X9=="Neither",3,
ifelse(x$X9=="Disagree",4,5))))
x$X10_num <- ifelse(x$X10=="Strong Agree",5,
ifelse(x$X10=="Agree",4,
ifelse(x$X10=="Neither",3,
ifelse(x$X10=="Disagree",2,1))))
x$X11_num <- ifelse(x$X11=="Strong Agree",1,
ifelse(x$X11=="Agree",2,
ifelse(x$X11=="Neither",3,
ifelse(x$X11=="Disagree",4,5))))
x$X12_num <- ifelse(x$X12=="Strong Agree",1,
ifelse(x$X12=="Agree",2,
ifelse(x$X12=="Neither",3,
ifelse(x$X12=="Disagree",4,5))))
x$X13_num <- ifelse(x$X13=="Strong Agree",1,
ifelse(x$X13=="Agree",2,
ifelse(x$X13=="Neither",3,
ifelse(x$X13=="Disagree",4,5))))
x$X14_num <- ifelse(x$X14=="Strong Agree",5,
ifelse(x$X14=="Agree",4,
ifelse(x$X14=="Neither",3,
ifelse(x$X14=="Disagree",2,1))))
x$X15_num <- ifelse(x$X15=="Strong Agree",1,
ifelse(x$X15=="Agree",2,
ifelse(x$X15=="Neither",3,
ifelse(x$X15=="Disagree",4,5))))
x$X16_num <- ifelse(x$X16=="Strong Agree",1,
ifelse(x$X16=="Agree",2,
ifelse(x$X16=="Neither",3,
ifelse(x$X16=="Disagree",4,5))))
x$X17_num <- ifelse(x$X17=="Strong Agree",5,
ifelse(x$X17=="Agree",4,
ifelse(x$X17=="Neither",3,
ifelse(x$X17=="Disagree",2,1))))
# Sum the total score
x$score <- x$X6_num + x$X7_num + x$X8_num + x$X9_num +
x$X10_num + x$X11_num +x$X12_num + x$X13_num +
x$X14_num +x$X15_num + x$X16_num +x$X17_num
# Character into Factor
x <- x %>%
mutate_if(is.character,as.factor)
}Note: should check one more time about the meaning of the likert scale, can we conclude that as we have a higher total score, then it is better? or worse?
# Subset the data for manipulation
data_ques <- data %>% select(c(15:27))
# Rename the variable's name
colnames(data_ques) <- c("X6","X7","X8","X9","X10",
"X11","X12","X13","X14","X15",
"X16","X17","Score")# Visualize the Total Score distribution
ggplot(data, aes(x = score))+
geom_histogram(col = "lightblue")+
theme_set + ggtitle("Total Score Distribution")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Distribution = okay
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.00 34.00 37.00 37.36 41.00 55.00
If the mean and median is close, we are on the right track.
# Checking the Age Cummulation based on Gender
ggplot(data, aes(x = Age)) +
geom_bar(aes(x = Age, fill = Gender), position = "dodge") +
theme_set + ggtitle("Histogram of Age based on Gender") Insight: Students with age interval between 21-25 are better in time management? (Check note)
# Checking Score Distribution based on Gender and Age
ggplot(data, aes(x = Age, y = score)) +
geom_boxplot(aes(fill = Gender)) +
theme_set + ggtitle("Score Distribution based on Gender and Age") # Determine the number of clustering use the Elbow Plot
set.seed(100)
# wss function to calculate the total distance within member
wss <- function(k) {
kproto(data_kmean, k)$tot.withinss
}
# Define k = 1 to 10
k.values <- 1:10## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
##
## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
Why 3? Reason:
# Clustering Model
set.seed(100)
# Set the model with 3 clusters
model_clus <- kproto(data_kmean, 3)## # NAs in variables:
## Age Gender X6 X7 X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0 0 0 0 0
## X15 X16 X17 score
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 40.0875
# Input new data as a (for simulation only)
a <- c("21-25", "F", "Strong Agree","Strong Disagree","Strong Disagree",
"Strong Agree","Strong Disagree", "Strong Agree","Strong Agree",
"Strong Agree","Disagree","Strong Disagree","Strong Disagree",
"Strong Disagree")# New data pre-processing
# Change the input into a dataframe
a <- data.frame(a)
# Transpose the input data
a <- data.frame(t(a))
# Rename the columns
colnames(a) <- c("Age","Gender","X6","X7","X8",
"X9","X10","X11","X12","X13",
"X14","X15","X16","X17")
# Apply the function
a <- clean(a)
# Since the kproton uses min of 2 observations,
# we bind it with the last observation from the original data
a <- rbind(data[125,],a)
# Subset the input data
a_kmean <- a %>%
select(-c(15:26))# Select the clustering result
cluster <- append(model_clus$cluster, test$cluster)
cluster <- as.factor(cluster)
# Make a new data frame from original data + input data
newdata <- rbind(data,a)
# Column bind data 2 + cluster result
data_clus <- cbind(newdata, cluster)
# A new data frame only the 'input data' for visualization needs
data_new <- data_clus[127,]Note: avoid use data1 data2, not good for reader
#For not overlaping
set.seed(100)
jitter <- position_jitter(width = 0.1, height = 0.1)
#plot clustering original
ggplot(NULL, aes(x = Age, y = score))+
geom_point(data = data_kmean, aes(col = as.factor(model_clus$cluster)),
position = jitter) +
theme(legend.position = "bottom") +
guides(fill=FALSE) +
scale_colour_manual(name = "Cluster",
labels = c("Bad", "Good","Normal"),
values = c("red","dark green", "gold")) +
theme_set + ggtitle("Clustering Result")# Boxplot of cluster and score (Optional = I prefer this plotly)
boxplot_cluster <- ggplot(data_kmean, aes(x = as.factor(model_clus$cluster), y = score)) +
geom_boxplot()
ggplotly(boxplot_cluster)# Plot clustering with new data
ggplot(NULL,aes(x=Age,y=score))+
geom_point(data=data_clus , aes(col=cluster), position = jitter) +
geom_jitter(data = data_new, aes(col=cluster), shape=8, size=3, stroke=2) +
theme(legend.position = "none") +
guides(fill=FALSE) +
scale_colour_manual(name="Cluster", labels=c("Bad", "Good","Normal"),
values=c("red","dark green", "gold")) +
theme_set + ggtitle("") Note: found the red star, means? What is the title for this plot?
# Take only the survey results & Cluster
data_arules <- data_clus %>%
select(c(3:14,28)) # Original survey results and 28 for the cluster identity
data_arules$cluster <- ifelse(data_arules$cluster==1, "Bad",
ifelse(data_arules$cluster==2, "Good", "Normal"))## Rows: 127
## Columns: 28
## $ Age <ord> >25, >25, >25, 21-25, 21-25, 21-25, 21-25, 21-25, 21-25, 21...
## $ Gender <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M,...
## $ X6 <fct> Disagree, Strong Agree, Disagree, Disagree, Neither, Agree,...
## $ X7 <fct> Agree, Agree, Strong Agree, Disagree, Disagree, Neither, Ag...
## $ X8 <fct> Strong Agree, Neither, Agree, Agree, Neither, Agree, Agree,...
## $ X9 <fct> Neither, Disagree, Disagree, Agree, Neither, Strong Agree, ...
## $ X10 <fct> Agree, Agree, Agree, Disagree, Disagree, Disagree, Agree, A...
## $ X11 <fct> Neither, Neither, Agree, Agree, Neither, Disagree, Neither,...
## $ X12 <fct> Disagree, Disagree, Disagree, Neither, Neither, Strong Agre...
## $ X13 <fct> Strong Disagree, Strong Disagree, Strong Disagree, Neither,...
## $ X14 <fct> Strong Agree, Neither, Disagree, Neither, Neither, Strong A...
## $ X15 <fct> Neither, Agree, Strong Agree, Neither, Disagree, Strong Agr...
## $ X16 <fct> Disagree, Neither, Strong Agree, Disagree, Neither, Strong ...
## $ X17 <fct> Agree, Disagree, Disagree, Agree, Agree, Strong Agree, Neit...
## $ X6_num <dbl> 4, 1, 4, 4, 3, 2, 4, 2, 4, 5, 5, 3, 4, 2, 4, 1, 3, 4, 4, 2,...
## $ X7_num <dbl> 4, 4, 5, 2, 2, 3, 4, 3, 2, 4, 3, 2, 3, 4, 4, 5, 2, 2, 2, 4,...
## $ X8_num <dbl> 5, 3, 4, 4, 3, 4, 4, 3, 1, 4, 3, 3, 3, 4, 2, 4, 4, 3, 4, 5,...
## $ X9_num <dbl> 3, 4, 4, 2, 3, 1, 3, 2, 2, 3, 3, 3, 3, 2, 2, 3, 2, 2, 4, 2,...
## $ X10_num <dbl> 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 3, 5, 2, 3, 1, 5, 3, 3, 5, 5,...
## $ X11_num <dbl> 3, 3, 2, 2, 3, 4, 3, 3, 2, 5, 3, 2, 4, 3, 4, 1, 4, 5, 4, 1,...
## $ X12_num <dbl> 4, 4, 4, 3, 3, 1, 3, 2, 2, 5, 3, 3, 4, 2, 1, 2, 4, 3, 4, 2,...
## $ X13_num <dbl> 5, 5, 5, 3, 3, 2, 5, 3, 4, 5, 3, 4, 4, 3, 5, 1, 4, 2, 2, 1,...
## $ X14_num <dbl> 5, 3, 2, 3, 3, 5, 2, 4, 4, 4, 3, 4, 2, 5, 4, 4, 3, 3, 4, 4,...
## $ X15_num <dbl> 3, 2, 1, 3, 4, 1, 5, 2, 4, 2, 3, 2, 4, 3, 3, 3, 4, 1, 4, 2,...
## $ X16_num <dbl> 4, 3, 1, 4, 3, 5, 5, 2, 4, 5, 3, 3, 2, 2, 2, 2, 4, 3, 2, 2,...
## $ X17_num <dbl> 4, 2, 2, 4, 4, 5, 3, 4, 2, 1, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4,...
## $ score <dbl> 48, 38, 38, 36, 36, 35, 45, 34, 33, 45, 38, 38, 39, 37, 34,...
## $ cluster <fct> 2, 3, 2, 3, 3, 3, 2, 3, 1, 2, 3, 3, 2, 3, 1, 3, 2, 3, 2, 1,...
# Apply the association rules with Apriori Algorithm
mba_bad <- apriori(data_arules,
parameter = list(sup = 0.05, conf = 0.5,
target="rules", minlen=3, maxlen=4),
appearance = list(rhs= "cluster=Bad", default = "lhs"))## Warning: Column(s) 13 not logical or factor. Applying default discretization
## (see '? discretizeDF').
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.05 3
## maxlen target ext
## 4 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 6
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[63 item(s), 127 transaction(s)] done [0.00s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4
## Warning in apriori(data_arules, parameter = list(sup = 0.05, conf = 0.5, :
## Mining stopped (maxlen reached). Only patterns up to a length of 4 returned!
## done [0.00s].
## writing ... [44 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
Note : Got an error about column 13 since it is not logical. But why? Glimpse says it is.
## lhs rhs support confidence coverage lift count
## [1] {X10=Disagree,
## X17=Disagree} => {cluster=Bad} 0.07086614 1.0000000 0.07086614 3.968750 9
## [2] {X10=Disagree,
## X12=Agree,
## X17=Disagree} => {cluster=Bad} 0.05511811 1.0000000 0.05511811 3.968750 7
## [3] {X7=Disagree,
## X10=Disagree,
## X17=Disagree} => {cluster=Bad} 0.06299213 1.0000000 0.06299213 3.968750 8
## [4] {X7=Disagree,
## X11=Agree,
## X17=Disagree} => {cluster=Bad} 0.05511811 1.0000000 0.05511811 3.968750 7
## [5] {X7=Disagree,
## X9=Agree,
## X17=Disagree} => {cluster=Bad} 0.05511811 1.0000000 0.05511811 3.968750 7
## [6] {X7=Disagree,
## X17=Disagree} => {cluster=Bad} 0.09448819 0.9230769 0.10236220 3.663462 12
## [7] {X8=Disagree,
## X17=Disagree} => {cluster=Bad} 0.05511811 0.8750000 0.06299213 3.472656 7
## [8] {X10=Disagree,
## X11=Agree} => {cluster=Bad} 0.05511811 0.8750000 0.06299213 3.472656 7
## [9] {X10=Disagree,
## X12=Agree} => {cluster=Bad} 0.05511811 0.8750000 0.06299213 3.472656 7
## [10] {X10=Disagree,
## X16=Agree} => {cluster=Bad} 0.05511811 0.8750000 0.06299213 3.472656 7
## NULL
Note: focus on the first 5, they have confidence value = 1.0
The scatter plot shows the lift for each rule.
## Warning: 'plotly_arules' is deprecated.
## Use 'plot' instead.
## See help("Deprecated")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
With this plot, we can see the arrow as a relationship between products. The size of rule is getting bigger as the lift increase.
This representation is also called as Parallel Coordinates Plot. It is useful to visualized which products along with which items cause what kind of sales.
Note: need to transpose/observe/gain deep understanding/making a note for each survey question towards the likert scale to make sure we read this right :)
Note: Please do the same with the following code
mba_good <- apriori(data_arules,
parameter = list(sup = 0.05, conf = 0.5,
target="rules",minlen=3,maxlen=4),
appearance = list(rhs= "cluster=Good", default = "lhs"))## Warning: Column(s) 13 not logical or factor. Applying default discretization
## (see '? discretizeDF').
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.05 3
## maxlen target ext
## 4 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 6
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[63 item(s), 127 transaction(s)] done [0.00s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4
## Warning in apriori(data_arules, parameter = list(sup = 0.05, conf = 0.5, :
## Mining stopped (maxlen reached). Only patterns up to a length of 4 returned!
## done [0.00s].
## writing ... [269 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## lhs rhs support confidence coverage lift count
## [1] {X6=Strong Disagree,
## X8=Agree} => {cluster=Good} 0.06299213 1 0.06299213 2.54 8
## [2] {X9=Disagree,
## X16=Disagree} => {cluster=Good} 0.05511811 1 0.05511811 2.54 7
## [3] {X12=Disagree,
## X15=Disagree} => {cluster=Good} 0.06299213 1 0.06299213 2.54 8
## [4] {X15=Disagree,
## X17=Neither} => {cluster=Good} 0.06299213 1 0.06299213 2.54 8
## [5] {X11=Disagree,
## X15=Disagree} => {cluster=Good} 0.08661417 1 0.08661417 2.54 11
## [6] {X11=Disagree,
## X12=Disagree} => {cluster=Good} 0.10236220 1 0.10236220 2.54 13
## [7] {X7=Agree,
## X16=Disagree} => {cluster=Good} 0.06299213 1 0.06299213 2.54 8
## [8] {X8=Agree,
## X12=Disagree,
## X15=Disagree} => {cluster=Good} 0.05511811 1 0.05511811 2.54 7
## [9] {X11=Disagree,
## X15=Disagree,
## X16=Disagree} => {cluster=Good} 0.06299213 1 0.06299213 2.54 8
## [10] {X15=Disagree,
## X16=Disagree,
## X17=Agree} => {cluster=Good} 0.06299213 1 0.06299213 2.54 8
## NULL
link <- paste0("https://quickbooks.intuit.com/r/employee-management/time-management-tips/#:~:text=If%20you%20want%20to%20improve%20your%20time%20management,compare%20actual%20time%20spent%20and%20estimated%20time%20spent.")
pages <- read_html(link)Tips <- pages %>%
html_nodes(".body-article .content-article h2") %>%
html_text()
Tips <- Tips[3:27]
#"or for more info you can clik [here](https://quickbooks.intuit.com/r/employee-management/time-management-tips/#:~:text=If%20you%20want%20to%20improve%20your%20time%20management,compare%20actual%20time%20spent%20and%20estimated%20time%20spent.)" #nanti dicantumin authornya sama linknya kita ambil tips) #thumbs up! well done GB